from gensim import corpora
from collections import defaultdict
from gensim import models

from gensim import corpora
from collections import defaultdict
documents = ["Human machine interface for lab abc computer applications",
             "A survey of user opinion of computer system response time",
             "The EPS user interface management system",
             "System and human system engineering testing of EPS",
             "Relation of user perceived response time to error measurement",
             "The generation of random binary unordered trees",
             "The intersection graph of paths in trees",
             "Graph minors IV Widths of trees and well quasi ordering",
             "Graph minors A survey"]

# 去掉停用词
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]

# 去掉只出现一次的单词
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
texts = [[token for token in text if frequency[token] > 1]
         for text in texts]

dictionary = corpora.Dictionary(texts)   # 生成词典

# 将文档存入字典，字典有很多功能，比如
# diction.token2id 存放的是单词-id key-value对
# diction.dfs 存放的是单词的出现频率
dictionary.save('./deerwester.dict')  # store the dictionary, for future reference
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('./deerwester.mm', corpus)  # store to disk, for later use

# 过滤掉出现频率最高的N个单词
dictionary.filter_n_most_frequent(20)

# 1.去掉出现次数低于no_below的
# 2.去掉出现次数高于no_above的。注意这个小数指的是百分数
# 3.在1和2的基础上，保留出现频率前keep_n的单词
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)

# 有两种用法，一种是去掉bad_id对应的词，另一种是保留good_id对应的词而去掉其他词。注意这里bad_ids和good_ids都是列表形式
dictionary.filter_tokens(bad_ids=None, good_ids=None)

# 在执行完前面的过滤操作以后，可能会造成单词的序号之间有空隙，这时就可以使用该函数来对词典来进行重新排序，去掉这些空隙。
dictionary.compactify()

import os
from gensim import corpora, models, similarities
from pprint import pprint
from matplotlib import pyplot as plt
import logging

# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

def PrintDictionary(dictionary):
    token2id = dictionary.token2id
    dfs = dictionary.dfs
    token_info = {}
    for word in token2id:
        token_info[word] = dict(
            word = word,
            id = token2id[word],
            freq = dfs[token2id[word]]
        )
    token_items = token_info.values()
    token_items = sorted(token_items, key = lambda x:x['id'])
    print('The info of dictionary: ')
    pprint(token_items)
    print('--------------------------')

def Show2dCorpora(corpus):
    nodes = list(corpus)
    ax0 = [x[0][1] for x in nodes] # 绘制各个doc代表的点
    ax1 = [x[1][1] for x in nodes]
    # print(ax0)
    # print(ax1)
    plt.plot(ax0,ax1,'o')
    plt.show()

if (os.path.exists("./deerwester.dict")):
    dictionary = corpora.Dictionary.load('./deerwester.dict')
    corpus = corpora.MmCorpus('./deerwester.mm')
    print("Used files generated from first tutorial")
else:
    print("Please run first tutorial to generate data set")

PrintDictionary(dictionary)

# 尝试将corpus(bow形式) 转化成tf-idf形式
tfidf_model = models.TfidfModel(corpus) # step 1 -- initialize a model 将文档由按照词频表示 转变为按照tf-idf格式表示
doc_bow = [(0, 1), (1, 1),[4,3]]
doc_tfidf = tfidf_model[doc_bow]

# 将整个corpus转为tf-idf格式
corpus_tfidf = tfidf_model[corpus]
# pprint(list(corpus_tfidf))
# pprint(list(corpus))

## LSI模型 **************************************************
# 转化为lsi模型, 可用作聚类或分类
lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
corpus_lsi = lsi_model[corpus_tfidf]
nodes = list(corpus_lsi)
# pprint(nodes)
lsi_model.print_topics(2) # 打印各topic的含义

# ax0 = [x[0][1] for x in nodes] # 绘制各个doc代表的点
# ax1 = [x[1][1] for x in nodes]
# print(ax0)
# print(ax1)
# plt.plot(ax0,ax1,'o')
# plt.show()

lsi_model.save('/tmp/model.lsi') # same for tfidf, lda, ...
lsi_model = models.LsiModel.load('/tmp/model.lsi')
#  *********************************************************

## LDA模型 **************************************************
lda_model = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=2)
corpus_lda = lda_model[corpus_tfidf]
Show2dCorpora(corpus_lsi)
# nodes = list(corpus_lda)
# pprint(list(corpus_lda))

# 此外，还有Random Projections, Hierarchical Dirichlet Process等模型

corpus_simi_matrix = similarities.MatrixSimilarity(corpus_lsi)
# 计算一个新的文本与既有文本的相关度
test_text = "Human computer interaction".split()
test_bow = dictionary.doc2bow(test_text)
test_tfidf = tfidf_model[test_bow]
test_lsi = lsi_model[test_tfidf]
test_simi = corpus_simi_matrix[test_lsi]
print(list(enumerate(test_simi)))